# Labels ----
# Build the target variable: TRUE when the outbreak affected at least one
# human (humansAffected is non-NA), FALSE when nobody was affected (NA).
# is.na() on a one-column data frame returns a one-column logical matrix,
# and negating it flips the meaning to "humans WERE affected".
diseaseLabels <- !is.na(diseaseInfo %>% select(humansAffected))
# check out the first few lines of our target variable
head(diseaseLabels)
## humansAffected
## [1,] TRUE
## [2,] FALSE
## [3,] FALSE
## [4,] FALSE
## [5,] FALSE
## [6,] FALSE
# ... and of the original column it was derived from
head(diseaseInfo$humansAffected)
## [1] 1 NA NA NA NA NA
# Keep only the numeric feature columns for modeling.
diseaseInfo_numeric <- diseaseInfo_humansRemoved %>%
  select(-Id, -longitude, -latitude) %>% # drop the case id (no useful signal)
  # and the raw coordinates (location is already captured by country)
  select_if(is.numeric) # then keep whichever remaining columns are numeric
# make sure that our dataframe is all numeric
str(diseaseInfo_numeric)
## Classes 'tbl_df', 'tbl' and 'data.frame': 17008 obs. of 5 variables:
## $ sumAtRisk : num NA 53 NA 61 93 12 103 49 13 NA ...
## $ sumCases : num NA 4 1 1 1 NA 1 9 10 1 ...
## $ sumDeaths : num NA 0 1 0 0 6 NA 0 10 0 ...
## $ sumDestroyed : num NA 0 0 0 0 6 NA 0 3 1 ...
## $ sumSlaughtered: num NA 0 0 0 0 NA NA 0 0 0 ...
# country is categorical, so it needs one-hot encoding (next step)
head(diseaseInfo$country)
## [1] "Saudi Arabia" "Italy" "Poland"
## [4] "Tunisia" "France" "Republic of Korea"
# One-hot encode the "country" column: the formula ~country-1 drops the
# intercept so every country level gets its own 0/1 indicator column.
# First, a preview on just the first few rows:
model.matrix(~country-1,head(diseaseInfo))
## countryFrance countryItaly countryPoland countryRepublic of Korea
## 1 0 0 0 0
## 2 0 1 0 0
## 3 0 0 1 0
## 4 0 0 0 0
## 5 1 0 0 0
## 6 0 0 0 1
## countrySaudi Arabia countryTunisia
## 1 1 0
## 2 0 0
## 3 0 0
## 4 0 1
## 5 0 0
## 6 0 0
## attr(,"assign")
## [1] 1 1 1 1 1 1
## attr(,"contrasts")
## attr(,"contrasts")$country
## [1] "contr.treatment"
# convert categorical factor into one-hot encoding for the FULL dataset
# NOTE(review): "region" actually holds country indicator columns, not regions.
# NOTE(review): this runs before options(na.action='na.pass') is set further
# down, so any NA countries would be silently dropped here, leaving region
# with fewer rows than diseaseInfo_numeric at cbind time — confirm the
# country column has no NAs.
region <- model.matrix(~country-1,diseaseInfo)
# some of the species descriptions (free text: domestication status + species,
# possibly several per row)
head(diseaseInfo$speciesDescription)
## [1] NA
## [2] "domestic, cattle"
## [3] "wild, wild boar"
## [4] "domestic, cattle, domestic, goat, domestic, sheep"
## [5] "domestic, cattle"
## [6] "domestic, unspecified bird"
# add a boolean column to our numeric dataframe indicating whether a species
# is domestic (NA descriptions stay NA)
diseaseInfo_numeric$is_domestic <- str_detect(diseaseInfo$speciesDescription, "domestic")
# get the species by extracting the last word of each description
speciesList <- diseaseInfo$speciesDescription %>%
  # FIX: str_replace only removed the FIRST punctuation character, so rows
  # with a comma before trailing parentheses kept a ")" at the end and the
  # [a-z]*$ extraction below returned "". str_replace_all removes them all,
  # matching the stated intent.
  str_replace_all("[[:punct:]]", "") %>% # remove punctuation (some rows have parentheses)
  str_extract("[a-z]*$") # extract the last word in each row
# convert our list into a dataframe...
speciesList <- tibble(species = speciesList)
# and convert to a matrix using 1 hot encoding
# NOTE(review): this changes na.action globally for the rest of the session,
# not just for the next call — keep NA rows so row counts line up for cbind
options(na.action = 'na.pass') # don't drop NA values!
species <- model.matrix(~species-1,speciesList)
# add our one-hot encoded variables and convert the dataframe into a matrix
# (xgboost requires an all-numeric matrix, not a data frame)
diseaseInfo_numeric <- cbind(diseaseInfo_numeric, region, species)
diseaseInfo_matrix <- data.matrix(diseaseInfo_numeric)
# compute the 70/30 train/test split point
# (diseaseLabels is a one-column matrix, so length() equals its row count)
numberOfTrainingSamples <- round(length(diseaseLabels) * .7)
# training data: the first 70% of rows
# NOTE(review): this is a sequential split, not a random one — it assumes row
# order carries no signal (e.g. data is not sorted by date or country); verify
train_data <- diseaseInfo_matrix[1:numberOfTrainingSamples,]
train_labels <- diseaseLabels[1:numberOfTrainingSamples]
# testing data: the remaining 30% of rows
test_data <- diseaseInfo_matrix[-(1:numberOfTrainingSamples),]
test_labels <- diseaseLabels[-(1:numberOfTrainingSamples)]
# put our testing & training data into two separate DMatrix objects
# (xgboost's optimized internal data structure)
dtrain <- xgb.DMatrix(data = train_data, label= train_labels)
dtest <- xgb.DMatrix(data = test_data, label= test_labels)
# Training the model ----
# train a baseline boosted-tree model using our training data
model <- xgboost(data = dtrain, # the data
nround = 2, # max number of boosting iterations
objective = "binary:logistic") # binary classification; predict() returns probabilities
## [1] train-error:0.013943
## [2] train-error:0.013943
# generate predictions (probabilities in [0, 1]) for our held-out testing data
pred <- predict(model, dtest)
# get & print the classification error: threshold probabilities at 0.5 and
# compare against the TRUE/FALSE test labels (logical coerces to 0/1)
err <- mean(as.numeric(pred > 0.5) != test_labels)
print(paste("test-error=", err))
## [1] "test-error= 0.0139161113288906"
# Tuning the model ----
# train an xgboost model with a shallower tree depth to reduce overfitting
model_tuned <- xgboost(data = dtrain, # the data
max.depth = 3, # the maximum depth of each decision tree (default is 6)
nround = 2, # max number of boosting iterations
objective = "binary:logistic") # the objective function
## [1] train-error:0.013943
## [2] train-error:0.013943
# generate predictions for our held-out testing data
pred <- predict(model_tuned, dtest)
# get & print the classification error (same 0.5 threshold as before)
err <- mean(as.numeric(pred > 0.5) != test_labels)
print(paste("test-error=", err))
## [1] "test-error= 0.0139161113288906"
# get the number of negative & positive cases in our training labels
# (the classes are heavily imbalanced: few outbreaks affect humans)
negative_cases <- sum(train_labels == FALSE)
# NOTE(review): "postive" is a typo but the name is referenced again further
# down, so it is kept as-is for compatibility
postive_cases <- sum(train_labels == TRUE)
# train a model using our training data, weighting positives up to
# compensate for the class imbalance
model_tuned <- xgboost(data = dtrain, # the data
max.depth = 3, # the maximum depth of each decision tree
nround = 10, # number of boosting rounds
early_stopping_rounds = 3, # if we dont see an improvement in this many rounds, stop
objective = "binary:logistic", # the objective function
scale_pos_weight = negative_cases/postive_cases) # control for imbalanced classes
## [1] train-error:0.014195
## Will train until train_error hasn't improved in 3 rounds.
##
## [2] train-error:0.014195
## [3] train-error:0.013859
## [4] train-error:0.013859
## [5] train-error:0.013859
## [6] train-error:0.013859
## Stopping. Best iteration:
## [3] train-error:0.013859
# generate predictions for our held-out testing data
pred <- predict(model_tuned, dtest)
# get & print the classification error
err <- mean(as.numeric(pred > 0.5) != test_labels)
print(paste("test-error=", err))
## [1] "test-error= 0.0139161113288906"
# train a model using our training data, now also adding gamma
# (minimum loss reduction required to make a further split) as regularization
model_tuned <- xgboost(data = dtrain, # the data
max.depth = 3, # the maximum depth of each decision tree
nround = 10, # number of boosting rounds
early_stopping_rounds = 3, # if we dont see an improvement in this many rounds, stop
objective = "binary:logistic", # the objective function
scale_pos_weight = negative_cases/postive_cases, # control for imbalanced classes
gamma = 1) # add a regularization term
## [1] train-error:0.014195
## Will train until train_error hasn't improved in 3 rounds.
##
## [2] train-error:0.014195
## [3] train-error:0.013859
## [4] train-error:0.013859
## [5] train-error:0.013859
## [6] train-error:0.013859
## Stopping. Best iteration:
## [3] train-error:0.013859
# generate predictions for our held-out testing data
pred <- predict(model_tuned, dtest)
# get & print the classification error
# (unchanged vs. the previous models — tuning had no effect on test error here)
err <- mean(as.numeric(pred > 0.5) != test_labels)
print(paste("test-error=", err))
## [1] "test-error= 0.0139161113288906"